{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2aa45358",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import soundfile as sf\n",
    "import re\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "\n",
    "def chunks(l, n):\n",
    "    for i in range(0, len(l), n):\n",
    "        yield (l[i: i + n], i // n)\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    df_split = chunks(strings, len(strings) // cores)\n",
    "    pool = Pool(cores)\n",
    "    pooled = pool.map(function, df_split)\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain(*pooled))\n",
    "\n",
    "timestamps = [i * 0.02 for i in range(1500 + 1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ca91b872",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "import fasttext\n",
    "\n",
    "filename = hf_hub_download(\n",
    "    repo_id=\"mesolitica/fasttext-language-detection-bahasa-en\", \n",
    "    filename=\"fasttext.ftz\"\n",
    ")\n",
    "lang_model = fasttext.load_model(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1c8bf0c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk(alignment, reject = -6.5, minimum_length = 1.0):\n",
    "    alls, temp = [], []\n",
    "    for a in alignment:\n",
    "        if a['score'] <= reject:\n",
    "            if len(temp):\n",
    "                if (temp[-1]['end'] - temp[0]['start']) >= minimum_length:\n",
    "                    temp[-1]['end'] = float(temp[-1]['end']) + 0.1\n",
    "                    alls.append(temp)\n",
    "                temp = []\n",
    "        else:\n",
    "            temp.append(a)\n",
    "            \n",
    "    if len(temp):\n",
    "        if (temp[-1]['end'] - temp[0]['start']) >= minimum_length:\n",
    "            temp[-1]['end'] = float(temp[-1]['end']) + 0.1\n",
    "            alls.append(temp)\n",
    "    return alls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6b7a8fd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/pseudolabel-imda-large-v3-timestamp/resolve/main/prepared-imda.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7cddacd9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1861125it [00:08, 229570.83it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1861082"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = defaultdict(list)\n",
    "with open('prepared-imda.jsonl') as fopen:\n",
    "    for no, l in tqdm(enumerate(fopen)):\n",
    "        l = json.loads(l)\n",
    "        data[l['audio_filename']].append((no, l))\n",
    "        \n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "77267581",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1861082"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rows = list(data.values())\n",
    "len(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e5235796",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf prepared-imda-chunks\n",
    "!mkdir prepared-imda-chunks"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
